Generalities

Import required libraries

if (!require("ggplot2")) install.packages("ggplot2")
if (!require("stringr")) install.packages("stringr")
if (!require("ggrepel")) install.packages("ggrepel")
if (!require("dplyr")) install.packages("dplyr")
if (!require("patchwork")) install.packages("patchwork")
if (!require("BiocManager")) install.packages("BiocManager")
if (!require("ggbreak")) install.packages("ggbreak")

if (!require("clusterProfiler")) BiocManager::install("clusterProfiler")
if (!require("AnnotationDbi")) BiocManager::install("AnnotationDbi")
if (!require("org.Hs.eg.db")) BiocManager::install("org.Hs.eg.db")
if (!require("org.Hs.eg.db")) BiocManager::install("DESeq2")

if (!require("edgeR")) BiocManager::install("edgeR")

library(ggplot2)
library(stringr)
library(ggrepel)
library(dplyr)
library(patchwork)
library(clusterProfiler)
library(AnnotationDbi)
library(org.Hs.eg.db)
library(DESeq2)
library(plotly)
library(ggbreak) 

library(edgeR)
CoV_mean <- function(arg_exp){
  mean_values_ <- rowMeans(arg_exp)
  CoV_values_ <- apply(arg_exp, 1, function(row) sd(row) / mean(row))
  stats_Experiment <- cbind(mean_values_, CoV_values_)
  colnames(stats_Experiment) <- c("Mean_expression",
                                  "CoV")
  return(as.data.frame(stats_Experiment))
}
plot_CoV_mean <- function(arg_dataframe, arg_x, arg_y,
                          arg_lab_title = "title", arg_lab_x = "x", arg_lab_y = "y",
                          arg_color = "blue"){
  ggplot(arg_dataframe, aes_string(x = arg_x, y = arg_y, label=c("Gene"))) +
    geom_point(color = arg_color, alpha = 0.4) +
    labs(x = arg_lab_x, y = arg_lab_y, title = arg_lab_title)+
    theme_light()+
    theme(plot.title = element_text(size = 12, hjust=0.5))+
    xlim(2.5, 15)+
    ylim(-0.01, 0.5)
}

Import dataset with gene counts

genes_list <- read.delim(file.path(project_dir, "GSE107593_raw_reads_BCHRNAseq.txt"), check.names=F)

Raw counts matrix

colnames(genes_list) <- c("ENSG_symbol", colnames(genes_list[2:length(colnames(genes_list))]))
genes_list[,c(1,3,10:ncol(genes_list))]

Obtaining metadata information

Patient IDs

# Read GSE107593_series_matrix.txt
metadata_matrix <- read.delim(file.path(project_dir, "Dataset copy, untouched", "GSE107593_series_matrix.txt"), check.names=F, skip = 25)
metadata_matrix_2 <- data.frame(t(metadata_matrix[,2:ncol(metadata_matrix)]))
colnames(metadata_matrix_2) <- metadata_matrix[,1]

# Obtain patient ids
patient_ids <- unique(str_replace(metadata_matrix_2[,12], "subject: ",""))

for (el in patient_ids){
  cat(paste0(el,"\n"))
}
157
877
1057
1077
1192
1214
8854
8855
8874
8878
8879
8881

Area from which samples were obtained

# Obtain sampling locations
location <- unique(str_replace(metadata_matrix_2[,11], "location: ",""))

for (el in sort(location)){
  cat(paste0(el,"\n"))
}
Ascending
Descending
Large
Rectum
Sigmoid
Tranverse

Lognormalization of counts

# Create df for normalization:
genes_list_lognorm <- data.frame(matrix(NA, nrow = nrow(genes_list), ncol = ncol(genes_list)))

# Columns before counts
for (col in 1:9){
  genes_list_lognorm[[col]] <- genes_list[[col]]
}

# Column names
colnames(genes_list_lognorm) <- colnames(genes_list)

for (i in 10:ncol(genes_list)) {
  genes_list_lognorm[, i] <- log2((genes_list[, i] / colSums(genes_list[i]) * 1000000)+8)
}

genes_list_lognorm[,c(1,3,10:ncol(genes_list))]

Most variable genes

Mean vs CoV for all genes

df_mean_vs_CoV <- genes_list_lognorm[,10:ncol(genes_list)]
stats_ <- CoV_mean(df_mean_vs_CoV)
stats_$ENSG <- genes_list_lognorm[[1]]
stats_$Gene <- genes_list_lognorm[[3]]
p1 <- plot_CoV_mean(stats_, arg_x = "Mean_expression", arg_y = "CoV", arg_lab_title = "Mean vs CoV, all samples", arg_lab_x = "Mean expression across samples", arg_lab_y = "Coefficient of Variation")
Warning: `aes_string()` was deprecated in ggplot2 3.0.0.
ℹ Please use tidy evaluation idioms with `aes()`.
ℹ See also `vignette("ggplot2-in-packages")` for more information.
This warning is displayed once every 8 hours.
Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
generated.
p1
Warning: Removed 2 rows containing missing values or values outside the scale range
(`geom_point()`).

Mean vs CoV for most variable genes

most_variable_genes <- (stats_ %>% arrange(desc(CoV)))[1:500,]
p2 <- plot_CoV_mean(most_variable_genes, arg_x = "Mean_expression", arg_y = "CoV", arg_lab_title = "Mean vs CoV, all samples, 500 most variable genes", arg_lab_x = "Mean expression across samples", arg_lab_y = "Coefficient of Variation")

p1plotly <- ggplotly(p2)
p1plotly

Principal component analysis (PCA)

PCA with all genes

Scree plot

# Create dataframe for PCA of lognormalized samples
genes_list_lognorm_for_pca <- t(genes_list_lognorm[,10:ncol(genes_list_lognorm)])
colnames(genes_list_lognorm_for_pca) <- rownames(genes_list_lognorm)

# PCA of lognorm counts
pca_lognorm <- prcomp(genes_list_lognorm_for_pca)

# Screeplot
pca_lognorm_metrics <- as.data.frame(t(summary(pca_lognorm)$importance))
colnames(pca_lognorm_metrics) <- c("Standard_deviation", "Proportion_of_Variance", "Cumulative_Proportion")
pca_lognorm_metrics$PCs <- as.numeric(substr(rownames(pca_lognorm_metrics), 3, length(rownames(pca_lognorm_metrics))))

ggplot(data = pca_lognorm_metrics[1:10,], aes(x=PCs, y=Proportion_of_Variance)) +
  geom_line(color="blue") +
  geom_point() +
  scale_x_continuous(breaks = c(1:10)) +
  labs(title="PCA of lognormalized counts - Variance explained per principal component")+
  xlab("Principal component")+
  ylab("Proportion of variance explained")+
  theme_light()+
  theme(plot.title = element_text(size=12),
        axis.title.x = element_text(size = 11),
        axis.title.y = element_text(size = 11),
        text = element_text(size=11),
        aspect.ratio = 1/1.5)

# Create dataframe to contain relevant metadata in appropriate format
df_samples_treatments <- data.frame(row.names = rownames(metadata_matrix_2))

# Get sample name and inflammation status
df_samples_treatments$sample_inflamation <- ""
for (row in (1:nrow(metadata_matrix_2))){
  df_samples_treatments[row, ncol(df_samples_treatments)] <- str_split(
    rownames(df_samples_treatments)[row],
    "Ulcerative colitis Colon Biopsy ",
    simplify = T)[2]}

# Get sample name
df_samples_treatments$sample <- ""
for (row in (1:nrow(df_samples_treatments))){
  df_samples_treatments[row, ncol(df_samples_treatments)] <- gsub(" [^ ]*$", "", df_samples_treatments[row, ncol(df_samples_treatments)-1])
}

# Get inflammation status
df_samples_treatments$inflammation <- ""
for (row in (1:nrow(df_samples_treatments))){
  df_samples_treatments[row, ncol(df_samples_treatments)] <- gsub(".* ", "", df_samples_treatments[row, ncol(df_samples_treatments)-2])
}

# Get sample location
df_samples_treatments$location <- ""
for (row in (1:nrow(df_samples_treatments))){
  df_samples_treatments[row, "location"] <- gsub("location: ", "", metadata_matrix_2[row, 11])
}

# Get sampling hospital
df_samples_treatments$hospital <- ""
for (row in (1:nrow(df_samples_treatments))){
  df_samples_treatments[row, "hospital"] <- gsub("site: ", "", metadata_matrix_2[row, 10])
}

# Create pca scores dataframe for lognormalized counts pca
pca_scores_lognorm <- data.frame(pca_lognorm$x[,1:10])
pca_scores_lognorm$patient <- ""

for (row in (1:nrow(pca_scores_lognorm))){
  for (id_patient in 1:length(patient_ids)){
    if(!is.na(str_extract(rownames(pca_scores_lognorm[row,]), patient_ids[id_patient]))){
      pca_scores_lognorm[row, ncol(pca_scores_lognorm)] <- str_extract(rownames(pca_scores_lognorm[row,]), patient_ids[id_patient])
    }
  }
}

# Get the sample id in the PCA scores dataframe
pca_scores_lognorm$sample <- rownames(pca_scores_lognorm)

# Get the inflammation status of each sample
pca_scores_lognorm$inflammation <- ""
for(row_scores in 1:nrow(pca_scores_lognorm)){
  for(row_df in 1:nrow(df_samples_treatments)){
    if(pca_scores_lognorm[row_scores, "sample"] == df_samples_treatments[row_df, "sample"]){
      pca_scores_lognorm[row_scores, "inflammation"] <- df_samples_treatments[row_df, "inflammation"]
    }
  }
}

# Get the sample location in the PCA scores dataframe
pca_scores_lognorm$location <- ""
for(row_scores in 1:nrow(pca_scores_lognorm)){
  for(row_df in 1:nrow(df_samples_treatments)){
    if(pca_scores_lognorm[row_scores, "sample"] == df_samples_treatments[row_df, "sample"]){
      pca_scores_lognorm[row_scores, "location"] <- df_samples_treatments[row_df, "location"]
    }
  }
}

# Get the sampling hospital in the PCA scores dataframe
pca_scores_lognorm$hospital <- ""
for(row_scores in 1:nrow(pca_scores_lognorm)){
  for(row_df in 1:nrow(df_samples_treatments)){
    if(pca_scores_lognorm[row_scores, "sample"] == df_samples_treatments[row_df, "sample"]){
      pca_scores_lognorm[row_scores, "hospital"] <- df_samples_treatments[row_df, "hospital"]
    }
  }
}

PCA Scores

p1 <- ggplot(pca_scores_lognorm, aes(x=PC1, y=PC2, colour = inflammation)) +
  geom_point(size=2)+
  labs(title = "Inflammation status")+
  theme_light()+
  theme(legend.position = "bottom",
        plot.title = element_text(hjust = 0.5),
        aspect.ratio = 1)+
  guides(colour=guide_legend(ncol=1))

p2 <- ggplot(pca_scores_lognorm, aes(x=PC1, y=PC2, colour = location)) +
  geom_point(size=2)+
  labs(title = "Sample location")+
  theme_light()+
  theme(legend.position = "bottom",
        plot.title = element_text(hjust = 0.5),
        aspect.ratio = 1)+
  guides(colour=guide_legend(ncol=2))

p1+p2+plot_annotation(title = "PCA scores according to sample metadata - inflammation status and sample location",
                      subtitle = "Counts log-normalized",
                      theme = theme(plot.title = element_text(hjust = 0.5, vjust = 3),
                                    plot.subtitle = element_text(hjust = 0.5, vjust = 3)))

ggplot(pca_scores_lognorm, aes(x=PC1, y=PC2, colour = patient, shape = inflammation, label=patient)) +
  geom_point(size=3)+
  geom_text_repel(nudge_x = 2)+
  labs(title = "PCA scores by patient ID and inflammation status")+
  theme_light()+
  theme(plot.title = element_text(hjust=0.5))

PCA with 500 most variable genes

# Create dataframe for PCA of lognormalized samples using 500 most variable genes
genes_list_lognorm_most_var_for_pca <- t((genes_list_lognorm %>% filter(ENSG_symbol %in% most_variable_genes$ENSG))[,10:ncol(genes_list_lognorm)])
colnames(genes_list_lognorm_most_var_for_pca) <- most_variable_genes$ENSG

# PCA of lognorm counts
pca_lognorm_most_var <- prcomp(genes_list_lognorm_most_var_for_pca)

pca_scores_lognorm_most_var_metrics <- data.frame(pca_lognorm_most_var$x[,1:10])
pca_scores_lognorm_most_var_metrics$patient <- pca_scores_lognorm$patient
pca_scores_lognorm_most_var_metrics$sample <- pca_scores_lognorm$sample
pca_scores_lognorm_most_var_metrics$inflammation <- pca_scores_lognorm$inflammation
pca_scores_lognorm_most_var_metrics$location <- pca_scores_lognorm$location
pca_scores_lognorm_most_var_metrics$hospital <- pca_scores_lognorm$hospital

PCA Scores

p1 <- ggplot(pca_scores_lognorm_most_var_metrics, aes(x=PC1, y=PC2, colour = inflammation)) +
  geom_point(size=2)+
  labs(title = "Inflammation status")+
  theme_light()+
  theme(legend.position = "bottom",
        plot.title = element_text(hjust = 0.5),
        aspect.ratio = 1)+
  guides(colour=guide_legend(ncol=1))

p2 <- ggplot(pca_scores_lognorm_most_var_metrics, aes(x=PC1, y=PC2, colour = location)) +
  geom_point(size=2)+
  labs(title = "Sample location")+
  theme_light()+
  theme(legend.position = "bottom",
        plot.title = element_text(hjust = 0.5),
        aspect.ratio = 1)+
  guides(colour=guide_legend(ncol=2))

p1+p2+plot_annotation(title = "PCA scores according to sample metadata - inflammation status and sample location",
                      subtitle = "Counts log-normalized, PCA over 500 Most variable genes",
                      theme = theme(plot.title = element_text(hjust = 0.5, vjust = 3),
                                    plot.subtitle = element_text(hjust = 0.5, vjust = 3)))

# Extract the rotations for pcas with lognormalized counts, all genes
pca_rotations <- data.frame(pca_lognorm$rotation[,1:10])
pca_lognorm_rotations <- data.frame(pca_lognorm$rotation[,1:10])

# Add column with gene names to rotations dataframes
pca_rotations$gene_name <- genes_list$gene_name
pca_lognorm_rotations$gene_name <- genes_list$gene_name

GO enrichment PC1 loadings

# Show genes with higher rotation values for pca (raw counts)
top_pos_pca_lognorm <- pca_lognorm_rotations %>% arrange(desc(PC1)) %>% dplyr::select(c(PC1, gene_name))
top20_pos_pca_lognorm <- top_pos_pca_lognorm[1:20,]
top20_pos_pca_lognorm$gene_name
 [1] "IGHG1"    "DUOX2"    "IGHG3"    "PI3"      "DUOXA2"   "IGHG2"   
 [7] "LCN2"     "CHI3L1"   "CXCL1"    "IGHG4"    "NOS2"     "REG1A"   
[13] "SLC6A14"  "MMP3"     "C3"       "DMBT1"    "IGHV4-34" "REG4"    
[19] "PLA2G2A"  "IGFBP5"  
GO_PC1_pos <- enrichGO(gene = top20_pos_pca_lognorm$gene_name, OrgDb = "org.Hs.eg.db", keyType = "SYMBOL", ont = "BP")

temp_GO_PC1_pos <- as.data.frame(GO_PC1_pos)[,c("ID", "Description", "GeneRatio", "p.adjust")]
temp_GO_PC1_pos$Description <- strtrim(temp_GO_PC1_pos$Description, 50)
rownames(temp_GO_PC1_pos) <- 1:nrow(temp_GO_PC1_pos)
temp_GO_PC1_pos
# Show genes with more negative rotation values for pca (raw counts)
top_neg_pca_lognorm <- pca_lognorm_rotations %>% arrange(PC1) %>% dplyr::select(c(PC1, gene_name))
top20_neg_pca_lognorm <- top_neg_pca_lognorm[1:20,]
top20_neg_pca_lognorm$gene_name
 [1] "AQP8"     "HMGCS2"   "SLC26A2"  "CA1"      "ANPEP"    "PCK1"    
 [7] "CHP2"     "B4GALNT2" "GUCA2A"   "ADH1C"    "PADI2"    "SLC26A3" 
[13] "ABCG2"    "FABP1"    "CA2"      "LYPD8"    "CKB"      "SLC51A"  
[19] "UGT2A3"   "SLC9A3"  
GO_PC1_neg <- enrichGO(gene = top20_neg_pca_lognorm$gene_name, OrgDb = "org.Hs.eg.db", keyType = "SYMBOL", ont = "BP")

temp_GO_PC1_neg <- as.data.frame(GO_PC1_neg)[,c("ID", "Description", "GeneRatio", "p.adjust")]
rownames(temp_GO_PC1_neg) <- 1:nrow(temp_GO_PC1_neg)
temp_GO_PC1_neg

Differential expression analysis

cts <- as.matrix(genes_list[,c(10:ncol(genes_list))])
rownames(cts) <- genes_list[[3]]

coldata <- pca_scores_lognorm[,c(11, 13:15)]
coldata$inflammation <- factor(coldata$inflammation)
coldata$location <- factor(coldata$location)
coldata$hospital <- factor(coldata$hospital)

cts <- cts[, rownames(coldata)]

as.data.frame(cts)
coldata
dds <- suppressMessages(DESeqDataSetFromMatrix(countData = cts,
                              colData = coldata,
                              design = ~ inflammation))
smallestGroupSize <- 3
keep <- rowSums(counts(dds) >= 10) >= smallestGroupSize
dds <- dds[keep,]
dds$inflammation <- relevel(dds$inflammation, ref = "Non-Inflamed")
dds <- suppressMessages(DESeq(dds))
res <- results(dds)
gene_list <- as.data.frame(res)
gene_list$Gene <- rownames(gene_list)
gene_list %>% filter(padj<0.05) %>% arrange(padj)
gene_list$significance <- NA
for (gene in 1:nrow(gene_list)){
  if (gene_list[gene,"log2FoldChange"]>=1 & gene_list[gene,"padj"]< 0.05){
    gene_list[gene, "significance"] <- "Up"
  }  
  
  else if (gene_list[gene,"log2FoldChange"]<=(-1) & gene_list[gene,"padj"]< 0.05){
    gene_list[gene, "significance"] <- "Down"
  } 
  
  else if (abs(gene_list[gene,"log2FoldChange"])<1 | gene_list[gene,"padj"]>= 0.05){
    gene_list[gene, "significance"] <- "Not significant"
  }
}

gene_list_sig_up_outlier <- gene_list %>% filter(log2FoldChange > 20)

GO enrichment up and down-regulated genes

# Get upregulated genes
dge_up <- gene_list %>% filter(significance == "Up")

# Get downregulated genes
dge_down <- gene_list %>% filter(significance == "Down")
GO_dge_up <- enrichGO(gene = dge_up$Gene, universe = gene_list$Gene, OrgDb = "org.Hs.eg.db", keyType = "SYMBOL", ont = "BP")

temp_GO_dge_up <- as.data.frame(GO_dge_up)[,c("ID", "Description", "GeneRatio", "p.adjust")]
temp_GO_dge_up$Description <- strtrim(temp_GO_dge_up$Description, 50)
rownames(temp_GO_dge_up) <- 1:nrow(temp_GO_dge_up)
temp_GO_dge_up
GO_dge_down <- enrichGO(gene = dge_down$Gene, universe = gene_list$Gene, OrgDb = "org.Hs.eg.db", keyType = "SYMBOL", ont = "BP")

temp_GO_dge_down <- as.data.frame(GO_dge_down)[,c("ID", "Description", "GeneRatio", "p.adjust")]
temp_GO_dge_down$Description <- strtrim(temp_GO_dge_down$Description, 50)
rownames(temp_GO_dge_down) <- 1:nrow(temp_GO_dge_down)
temp_GO_dge_down

Differential expression analysis with edgeR

Construct edgeR DGElist object

cts_edgeR <- cts
inflammation_status <- as.factor(coldata$inflammation)
inflammation_status <- relevel(inflammation_status, "Non-Inflamed")

dge <- DGEList(counts = cts_edgeR, group = inflammation_status)

Filter low-expressed genes

keep <- filterByExpr(y=dge)

dge <- dge[keep, , keep.lib.sizes=F]

Normalization

dge <- calcNormFactors(dge)
dge$samples

Estimate dispersion

dge <- estimateDisp(y = dge)
Using classic mode.
diff_expr <- exactTest(dge)
top_diff_expr <- topTags(diff_expr, n="Inf")
gene_list_edgeR <- as.data.frame(top_diff_expr)
gene_list_edgeR
gene_list_edgeR$Gene <- rownames(gene_list_edgeR)

gene_list_edgeR$significance <- NA
for (gene in 1:nrow(gene_list_edgeR)){
  if (gene_list_edgeR[gene,"logFC"]>=1 & gene_list_edgeR[gene,"FDR"]< 0.05){
    gene_list_edgeR[gene, "significance"] <- "Up"
  }  
  
  else if (gene_list_edgeR[gene,"logFC"]<=(-1) & gene_list_edgeR[gene,"FDR"]< 0.05){
    gene_list_edgeR[gene, "significance"] <- "Down"
  } 
  
  else if (abs(gene_list_edgeR[gene,"logFC"])<1 | gene_list_edgeR[gene,"FDR"]>= 0.05){
    gene_list_edgeR[gene, "significance"] <- "Not significant"
  }
}

GO enrichment up and down-regulated genes

# Get upregulated genes
dge_edgeR_up <- gene_list_edgeR %>% filter(significance == "Up")

# Get downregulated genes
dge_edgeR_down <- gene_list_edgeR %>% filter(significance == "Down")
dge_edgeR_up <- enrichGO(gene = dge_edgeR_up$Gene, universe = gene_list_edgeR$Gene, OrgDb = "org.Hs.eg.db", keyType = "SYMBOL", ont = "BP")

temp_GO_dge_edgeR_up <- as.data.frame(dge_edgeR_up)[,c("ID", "Description", "GeneRatio", "p.adjust")]
temp_GO_dge_edgeR_up$Description <- strtrim(temp_GO_dge_edgeR_up$Description, 50)
rownames(temp_GO_dge_edgeR_up) <- 1:nrow(temp_GO_dge_edgeR_up)
temp_GO_dge_edgeR_up
GO_dge_edgeR_down <- enrichGO(gene = dge_edgeR_down$Gene, universe = gene_list_edgeR$Gene, OrgDb = "org.Hs.eg.db", keyType = "SYMBOL", ont = "BP")

temp_GO_dge_edgeR_down <- as.data.frame(GO_dge_edgeR_down)[,c("ID", "Description", "GeneRatio", "p.adjust")]
temp_GO_dge_edgeR_down$Description <- strtrim(temp_GO_dge_edgeR_down$Description, 50)
rownames(temp_GO_dge_edgeR_down) <- 1:nrow(temp_GO_dge_edgeR_down)
temp_GO_dge_edgeR_down